/**************************************************************************
* This programs makes the dataset to be used in the analysis.
*
* Background: Interest-only (IO) loans were introduced in Denmark in 2003.
*
* Project: 	Mortgage Choice and Expenditure over the Lifecycle:
*			Evidence from Expiring Interest-Only Loans
*
* Project participants: Henrik Yde Andersen, HYA
*						Stine Ludvig Bech, SLB
*						Alessia De Stefani, ADS
*					
* Date:	October 2020
**************************************************************************/

dm 'cle log';
dm 'cle out';
dm 'keydef f12 "cle log; cle out; submit"';

proc datasets lib=work kill memtype=data;
run;

libname save 	"E:\ProjektDB\706172\Workdata\706172\Husholdningsprojekt\Mortgage Choice and Expenditure\Data";
libname databib "E:\ProjektDB\706172\Workdata\706172\Datas�t";
libname ekstern "E:\ProjektDB\706172\DST_rawdata_706172\Eksterndata";
libname raw		"E:\ProjektDB\706172\DST_rawdata_706172";

libname fmt  '\\srvfsenas1\data\formater\SAS formater i Danmarks Statistik\FORMATKATALOG';
options fmtsearch=(fmt.times_personstatistik fmt.brancher fmt.uddannelser fmt.geokoder work egnefmt);

**************************************************************************
1) Raw data
	- REAL 2009 - 2017 (measured 31st December)
	- Befolkning 2010 - 2018 (measured 1st January)
	- Familiedatas�t 1998 - 2017
**************************************************************************;

%let startaar		= 2009;
%let startaar_p1	= 2010;
%let slutaar		= 2017;

%macro raadata;
*MORTGAGE DATA;
	%do i=&startaar %to &slutaar;
		data real&i;
			set raw.real&i;
*Correcting a mistake in data where date of loan uptake is after beginning of IO period;
	if afstart ne 0 and laandato > afstart then laandato = afstart;
*Date variables;
	*Formatting dates for the IO period (afstart=IO start, afslut=IO end);
		if afstart ne 0 then afstart_dato = input(afstart,ddmmyy10.);
		if afslut ne 0 then afslut_dato = input(afslut,ddmmyy10.);
			format afstart_dato afslut_dato ddmmyy10.;
	*Year variables;
		y_laanoptag = year(laandato); 	*Year of loan uptake;
		y_afstart = year(afstart_dato); *Year of IO start;
		y_afslut = year(afslut_dato); 	*Year of IO end;
	*Year and months variables;
		ym_laanoptag = laandato;		*Year and month of loan uptake;
		ym_afstart = afstart_dato;		*Year and month of IO start;
		ym_afslut = afslut_dato;		*Year and month of IO end;
			format ym_laanoptag ym_afstart ym_afslut yymm6.;
	*Length of IO period measured in years and months (l_afdragsfrihed = length of IO period, md=months and aar=years);
		if afslut_dato ne . and afstart_dato ne . then l_afdragsfrihed_md = intck("month",afstart_dato,afslut_dato);	
		if afslut_dato ne . and afstart_dato ne . then l_afdragsfrihed_aar = intck("year",afstart_dato,afslut_dato);
	*Variable indicating whether a new loan was taken in year t incl. loan characteristics (IO, type of interest).Later grouped on household level;
		*New loan;
		nytlaan = 0;
			if y_laanoptag = &i then nytlaan = 1;
		*New loan - IO or non IO;
		nytlaan_uden_afdrag = 0;
			if y_laanoptag = &i and y_afstart = &i then nytlaan_uden_afdrag = 1;
		nytlaan_med_afdrag = 0;
			if y_laanoptag = &i and y_afstart ne &i then nytlaan_med_afdrag = 1;
		*New loan - fixed or variable interest (interest is defined by 'rentebinding'. If the interest is fixed in more than 10 years then fixed interest rate.;
		nytlaan_var = 0;
			if y_laanoptag = &i and rntbind <= 10 then nytlaan_var = 1;
		nytlaan_fast = 0;
			if y_laanoptag = &i and rntbind > 10 then nytlaan_fast = 1;
		*New loan IO or non IO, fixed or variable interest;
		nytlaan_var_uden_afdrag = 0;
			if y_laanoptag = &i and rntbind <= 10 and y_afstart = &i then nytlaan_var_uden_afdrag = 1;
		nytlaan_var_med_afdrag = 0;
			if y_laanoptag = &i and rntbind <= 10 and y_afstart ne &i then nytlaan_var_med_afdrag = 1;
		nytlaan_fast_uden_afdrag = 0;
			if y_laanoptag = &i and rntbind > 10 and y_afstart = &i then nytlaan_fast_uden_afdrag = 1;
		nytlaan_fast_med_afdrag = 0;
			if y_laanoptag = &i and rntbind > 10 and y_afstart ne &i then nytlaan_fast_med_afdrag = 1;

		proc sort; by laannr pnr;
		run;
	%end;

%mend raadata;
%raadata; 

%macro overlev;
*DEFINING WHETHER THE IO LOAN HAS 'SURVIVED' FROM LAST YEAR;
*We code the variable overlev_lastyear = 1 in year t if the loan is in the dataset in year t-1.;
%do i = &startaar+1 %to &slutaar;
	%let j = %eval(&i-1);
		data overlev&i;
			merge real&i (in = a keep = laannr pnr)
				  real&j (in = b keep = laannr pnr y_laanoptag ym_laanoptag ym_afstart l_afdragsfrihed_aar hovedobl lobetid ltv);
			by laannr pnr;
			if a and b then overlev_lastyear = 1;
			if b and not a then overlev_lastyear = 0;
			if a and not b then overlev_lastyear = 2; *New loan and/or debtor in year t;
			treat_aar = &i;
			aar = &i;
		run;
		proc freq data=overlev&i; 
			table overlev_lastyear*treat_aar / nopercent norow; 
		run;
%end;
%mend overlev;
%overlev;

%macro sample;
*MAKING THE SAMPLE. CONSISTS OF ALL LOANS THAT START WITH AN IO PERIOD AND CONTINUES THE IO PERIOD FOR AT LEAST NINE YEARS;
%do i = &startaar+1 %to &slutaar;
	proc sort data=overlev&i; 
		by aar pnr descending hovedobl; 
	run; 

	data sample_&i;
		set overlev&i (where = (y_laanoptag = (&i-10) and l_afdragsfrihed_aar ge 9)); *Dataset conditioned on loan date in year t-10 and length of IO on 9 years or more;
		by aar pnr descending hovedobl;
	run;
	proc freq data=sample_&i; 
		table overlev_lastyear*treat_aar / norow nopercent; 
	run;
%end;
%mend sample;
%sample;

%macro bef;
*POPULATION DATA - TO GET FAMILY ID;
	%do i = &startaar %to &slutaar;
	%let j = %eval(&i+1);
		data bef&j;
			set raw.bef&j(keep = pnr familie_id);
		aar = &i;
		proc sort; by pnr;
		run;

*MERGE OF MORTGAGE AND POPULATION DATA;
		proc sort data = real&i; by pnr; run;

		data real_bef&i;
			merge real&i (in = a)
				  bef&j (in = b);
			by aar pnr;
			if a and b;
		run;
	%end;

	%do i = &startaar+1 %to &slutaar;
	%let j = %eval(&i+1);
*MERGE OF SAMPLE OG POPULATION DATA;
*Sample starts in 2010 with the population of IO loans that was there in 2009 as well, and that has an IO loan with at least 9 years of IO;
		proc sort data = sample_&i; by pnr; run;

		data sample_bef&i;
			merge sample_&i (in = a)
				  bef&j (in = b);
			by aar pnr;
			if a and b;
			proc sort; by laannr;
		run;
	%end;
%mend bef;
%bef; 

*APPENDING YEARLY MORTGAGE DATASETS;
proc datasets library=work;
	delete real_laan;
run;
data real_laan;
	set real_bef&startaar - real_bef&slutaar;
run;

*APPENDING YEARLY DATASETS THAT DEFINE THE SAMPLE;
proc datasets library=work;
	delete sample_sort1;
run;
data sample_sort1(drop = pnr);
	set sample_bef&startaar_p1 - sample_bef&slutaar;
	proc sort; by aar laannr familie_id;
run;

*SORTING TO MAKE SURE WE ONLY INCLUDE A LOAN ONCE (IN CASE OF MORE DEBTORS);
data sample_sort2;
	set	sample_sort1;
	by aar laannr familie_id;
	if first.laannr; 
	proc sort; by aar familie_id descending hovedobl;
run;

*IF A FAMILY HAS MORE THAN ONE IO LOAN IN THE SAME YEAR, WE ONLY INCLUDE THE LARGEST;
data sample;
	set sample_sort2;
	by aar familie_id descending hovedobl;
	if first.familie_id; 
run;

proc freq data=sample; 
	table overlev_lastyear*treat_aar / norow nopercent; 
run;
proc freq data=sample; table y_laanoptag; run;

*USING FAMILIEDATA (HOUSEHOLD DATASET) TO GET BACKGROUND INFORMATION) FROM YEAR 1998;
data familiedata;
	set databib.familiedata(where=(aar >= 1998 and aar<=2017));
	fam_finaktiver = sum(famindestpi,famkursakt,famoblakt);
proc sort; by familie_id aar;
run;

*GROUPING MORTGAGE DATASET ON YEAR-HOUSEHOLD LEVEL;
proc sql;
	create table laansammeaar as
		select
			aar
			, familie_id
			, max(nytlaan) as nytlaan
			, max(nytlaan_var_uden_afdrag) as nytlaan_var_uden_afdrag
			, max(nytlaan_var_med_afdrag) as nytlaan_var_med_afdrag
			, max(nytlaan_fast_uden_afdrag) as nytlaan_fast_uden_afdrag
			, max(nytlaan_fast_med_afdrag) as nytlaan_fast_med_afdrag
		from real_laan
	group by familie_id, aar;
quit;

*PUTTING DATA INTO ONE MASTER DATA;
*INPUT: 	sample (year-household level. Contains all families with at least one IO loan with at least 9 years IO period. 2013-2017)
			familiedata (year-household level. Contains household level background information. 1998-2017)
			laansammeaar (year-household level. Contains information on new mortgage loans. 2009-2017);
*OUTPUT:	master;
proc sort data=sample; by familie_id aar; run;
data help_1; *Step is made to make sure background information is included for the sample only, and such that we get background information back in time;
	merge sample (in=a keep=familie_id)
		  familiedata  (in=b);
	by familie_id;
	if a and b;
run;

proc sql;
	create table master as
		select 
			  a.*
			, b.*
			, c.*
		from
			help_1 a 
			left join laansammeaar b on a.familie_id = b.familie_id and a.aar = b.aar
			left join sample c on a.familie_id = c.familie_id and a.aar = c.aar;
quit;
proc freq data=master; table overlev_lastyear*treat_aar /norow nopercent; run;

data save.master;
	set master;
run;
proc export data=master outfile="E:\ProjektDB\706172\Workdata\706172\Husholdningsprojekt\Mortgage Choice and Expenditure\Data/master.dta" replace;
quit;

